Import the Libraries¶

In [12]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

Import the Dataset¶

In [2]:
df = pd.read_csv('HR-Employee-Attrition.csv')
df.head()
Out[2]:
Age Attrition BusinessTravel DailyRate Department DistanceFromHome Education EducationField EmployeeCount EmployeeNumber ... RelationshipSatisfaction StandardHours StockOptionLevel TotalWorkingYears TrainingTimesLastYear WorkLifeBalance YearsAtCompany YearsInCurrentRole YearsSinceLastPromotion YearsWithCurrManager
0 41 Yes Travel_Rarely 1102 Sales 1 2 Life Sciences 1 1 ... 1 80 0 8 0 1 6 4 0 5
1 49 No Travel_Frequently 279 Research & Development 8 1 Life Sciences 1 2 ... 4 80 1 10 3 3 10 7 1 7
2 37 Yes Travel_Rarely 1373 Research & Development 2 2 Other 1 4 ... 2 80 0 7 3 3 0 0 0 0
3 33 No Travel_Frequently 1392 Research & Development 3 4 Life Sciences 1 5 ... 3 80 0 8 3 3 8 7 3 0
4 27 No Travel_Rarely 591 Research & Development 2 1 Medical 1 7 ... 4 80 1 6 3 3 2 2 2 2

5 rows × 35 columns

In [3]:
df.shape
Out[3]:
(1470, 35)
In [4]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 35 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Age                       1470 non-null   int64 
 1   Attrition                 1470 non-null   object
 2   BusinessTravel            1470 non-null   object
 3   DailyRate                 1470 non-null   int64 
 4   Department                1470 non-null   object
 5   DistanceFromHome          1470 non-null   int64 
 6   Education                 1470 non-null   int64 
 7   EducationField            1470 non-null   object
 8   EmployeeCount             1470 non-null   int64 
 9   EmployeeNumber            1470 non-null   int64 
 10  EnvironmentSatisfaction   1470 non-null   int64 
 11  Gender                    1470 non-null   object
 12  HourlyRate                1470 non-null   int64 
 13  JobInvolvement            1470 non-null   int64 
 14  JobLevel                  1470 non-null   int64 
 15  JobRole                   1470 non-null   object
 16  JobSatisfaction           1470 non-null   int64 
 17  MaritalStatus             1470 non-null   object
 18  MonthlyIncome             1470 non-null   int64 
 19  MonthlyRate               1470 non-null   int64 
 20  NumCompaniesWorked        1470 non-null   int64 
 21  Over18                    1470 non-null   object
 22  OverTime                  1470 non-null   object
 23  PercentSalaryHike         1470 non-null   int64 
 24  PerformanceRating         1470 non-null   int64 
 25  RelationshipSatisfaction  1470 non-null   int64 
 26  StandardHours             1470 non-null   int64 
 27  StockOptionLevel          1470 non-null   int64 
 28  TotalWorkingYears         1470 non-null   int64 
 29  TrainingTimesLastYear     1470 non-null   int64 
 30  WorkLifeBalance           1470 non-null   int64 
 31  YearsAtCompany            1470 non-null   int64 
 32  YearsInCurrentRole        1470 non-null   int64 
 33  YearsSinceLastPromotion   1470 non-null   int64 
 34  YearsWithCurrManager      1470 non-null   int64 
dtypes: int64(26), object(9)
memory usage: 402.1+ KB
In [5]:
df.describe()
Out[5]:
Age DailyRate DistanceFromHome Education EmployeeCount EmployeeNumber EnvironmentSatisfaction HourlyRate JobInvolvement JobLevel ... RelationshipSatisfaction StandardHours StockOptionLevel TotalWorkingYears TrainingTimesLastYear WorkLifeBalance YearsAtCompany YearsInCurrentRole YearsSinceLastPromotion YearsWithCurrManager
count 1470.000000 1470.000000 1470.000000 1470.000000 1470.0 1470.000000 1470.000000 1470.000000 1470.000000 1470.000000 ... 1470.000000 1470.0 1470.000000 1470.000000 1470.000000 1470.000000 1470.000000 1470.000000 1470.000000 1470.000000
mean 36.923810 802.485714 9.192517 2.912925 1.0 1024.865306 2.721769 65.891156 2.729932 2.063946 ... 2.712245 80.0 0.793878 11.279592 2.799320 2.761224 7.008163 4.229252 2.187755 4.123129
std 9.135373 403.509100 8.106864 1.024165 0.0 602.024335 1.093082 20.329428 0.711561 1.106940 ... 1.081209 0.0 0.852077 7.780782 1.289271 0.706476 6.126525 3.623137 3.222430 3.568136
min 18.000000 102.000000 1.000000 1.000000 1.0 1.000000 1.000000 30.000000 1.000000 1.000000 ... 1.000000 80.0 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000
25% 30.000000 465.000000 2.000000 2.000000 1.0 491.250000 2.000000 48.000000 2.000000 1.000000 ... 2.000000 80.0 0.000000 6.000000 2.000000 2.000000 3.000000 2.000000 0.000000 2.000000
50% 36.000000 802.000000 7.000000 3.000000 1.0 1020.500000 3.000000 66.000000 3.000000 2.000000 ... 3.000000 80.0 1.000000 10.000000 3.000000 3.000000 5.000000 3.000000 1.000000 3.000000
75% 43.000000 1157.000000 14.000000 4.000000 1.0 1555.750000 4.000000 83.750000 3.000000 3.000000 ... 4.000000 80.0 1.000000 15.000000 3.000000 3.000000 9.000000 7.000000 3.000000 7.000000
max 60.000000 1499.000000 29.000000 5.000000 1.0 2068.000000 4.000000 100.000000 4.000000 5.000000 ... 4.000000 80.0 3.000000 40.000000 6.000000 4.000000 40.000000 18.000000 15.000000 17.000000

8 rows × 26 columns

In [6]:
df.Attrition.value_counts()
Out[6]:
No     1233
Yes     237
Name: Attrition, dtype: int64
In [7]:
df.Department.value_counts()
Out[7]:
Research & Development    961
Sales                     446
Human Resources            63
Name: Department, dtype: int64

Checking for Null Values¶

In [8]:
df.isnull().any()
Out[8]:
Age                         False
Attrition                   False
BusinessTravel              False
DailyRate                   False
Department                  False
DistanceFromHome            False
Education                   False
EducationField              False
EmployeeCount               False
EmployeeNumber              False
EnvironmentSatisfaction     False
Gender                      False
HourlyRate                  False
JobInvolvement              False
JobLevel                    False
JobRole                     False
JobSatisfaction             False
MaritalStatus               False
MonthlyIncome               False
MonthlyRate                 False
NumCompaniesWorked          False
Over18                      False
OverTime                    False
PercentSalaryHike           False
PerformanceRating           False
RelationshipSatisfaction    False
StandardHours               False
StockOptionLevel            False
TotalWorkingYears           False
TrainingTimesLastYear       False
WorkLifeBalance             False
YearsAtCompany              False
YearsInCurrentRole          False
YearsSinceLastPromotion     False
YearsWithCurrManager        False
dtype: bool
In [9]:
df.isnull().sum()
Out[9]:
Age                         0
Attrition                   0
BusinessTravel              0
DailyRate                   0
Department                  0
DistanceFromHome            0
Education                   0
EducationField              0
EmployeeCount               0
EmployeeNumber              0
EnvironmentSatisfaction     0
Gender                      0
HourlyRate                  0
JobInvolvement              0
JobLevel                    0
JobRole                     0
JobSatisfaction             0
MaritalStatus               0
MonthlyIncome               0
MonthlyRate                 0
NumCompaniesWorked          0
Over18                      0
OverTime                    0
PercentSalaryHike           0
PerformanceRating           0
RelationshipSatisfaction    0
StandardHours               0
StockOptionLevel            0
TotalWorkingYears           0
TrainingTimesLastYear       0
WorkLifeBalance             0
YearsAtCompany              0
YearsInCurrentRole          0
YearsSinceLastPromotion     0
YearsWithCurrManager        0
dtype: int64
In [10]:
df.drop(['EmployeeCount','EmployeeNumber','Over18','StandardHours'],axis=1,inplace=True)
In [11]:
df.shape
Out[11]:
(1470, 31)

Data Preprocessing¶

In [13]:
sns.countplot(x='Attrition',data=df)
Out[13]:
<Axes: xlabel='Attrition', ylabel='count'>
In [14]:
sns.distplot(df['Age'])
C:\Users\Arun\AppData\Local\Temp\ipykernel_17120\3255828239.py:1: UserWarning: 

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  sns.distplot(df['Age'])
Out[14]:
<Axes: xlabel='Age', ylabel='Density'>
In [15]:
plt.figure(figsize=(15,5))
sns.countplot(x='Age',hue='Attrition',data=df)
Out[15]:
<Axes: xlabel='Age', ylabel='count'>
In [16]:
sns.barplot(data=df,x='Attrition',y='Age',hue='Gender')
Out[16]:
<Axes: xlabel='Attrition', ylabel='Age'>
In [17]:
plt.pie(df.BusinessTravel.value_counts(),labels=df.BusinessTravel.unique(),autopct='%.2f%%')
plt.title("Business Travel")
Out[17]:
Text(0.5, 1.0, 'Business Travel')
In [18]:
plt.pie(df.Department.value_counts(),labels=['R&D','Sales','HR'],autopct='%1.1f%%')
plt.title('Department')
Out[18]:
Text(0.5, 1.0, 'Department')
In [19]:
plt.figure(figsize=(7,7))
plt.pie(df.EducationField.value_counts(),labels=['Life Sciences','Medical','Marketing','Technical Degree','Other','Human Resources'],autopct='%.2f%%')
plt.title("Business Travel")
Out[19]:
Text(0.5, 1.0, 'Business Travel')
In [20]:
sns.scatterplot(data=df,x='Age',y='TotalWorkingYears')
Out[20]:
<Axes: xlabel='Age', ylabel='TotalWorkingYears'>
In [21]:
plt.figure(figsize=(20,20))
sns.heatmap(df.corr(),annot=True)
C:\Users\Arun\AppData\Local\Temp\ipykernel_17120\271047654.py:2: FutureWarning: The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning.
  sns.heatmap(df.corr(),annot=True)
Out[21]:
<Axes: >
In [22]:
plt.figure(figsize=(30,30))
sns.pairplot(df)
Out[22]:
<seaborn.axisgrid.PairGrid at 0x25220cf2910>
<Figure size 3000x3000 with 0 Axes>

Outlier Handling¶

In [23]:
plt.figure(figsize=(15,7))
plt.subplot(3,3,1)
sns.boxplot(df.MonthlyIncome)
plt.ylabel("Monthly Income")
plt.subplot(3,3,2)
sns.boxplot(df.TotalWorkingYears)
plt.ylabel("Total Working Years")
plt.subplot(3,3,3)
sns.boxplot(df.TrainingTimesLastYear)
plt.ylabel("Training Times Last Year")
plt.subplot(3,3,4)
sns.boxplot(df.YearsAtCompany)
plt.ylabel("Years At Company")
plt.subplot(3,3,5)
sns.boxplot(df.YearsInCurrentRole)
plt.ylabel("Years In Current Role")
plt.subplot(3,3,6)
sns.boxplot(df.YearsSinceLastPromotion)
plt.ylabel("Years Since Last Promotion")
plt.subplot(3,3,7)
sns.boxplot(df.YearsWithCurrManager)
plt.ylabel("Years With Curr Manager")
plt.show()
In [24]:
q1 = df.MonthlyIncome.quantile(0.25)
q3 = df.MonthlyIncome.quantile(0.75)
IQR = q3-q1
IQR
Out[24]:
5468.0
In [25]:
upper_limit = q3+1.5*IQR
In [26]:
df['MonthlyIncome'] = np.where(df['MonthlyIncome']>upper_limit,upper_limit,df['MonthlyIncome'])
In [27]:
sns.boxplot(df.MonthlyIncome)
Out[27]:
<Axes: >
In [28]:
q1 = df.TotalWorkingYears.quantile(0.25)
q3 = df.TotalWorkingYears.quantile(0.75)
IQR = q3-q1
IQR
Out[28]:
9.0
In [29]:
upper_limit = q3+1.5*IQR
In [30]:
df['TotalWorkingYears'] = np.where(df['TotalWorkingYears']>upper_limit,upper_limit,df['TotalWorkingYears'])
In [31]:
sns.boxplot(df['TotalWorkingYears'])
Out[31]:
<Axes: >
In [32]:
q1 = df.TrainingTimesLastYear.quantile(0.25)
q3 = df.TrainingTimesLastYear.quantile(0.75)
IQR = q3-q1
IQR
Out[32]:
1.0
In [33]:
upper_limit = q3+1.5*IQR
lower_limit = q1-1.5*IQR
In [34]:
df['TrainingTimesLastYear'] = np.where(df['TrainingTimesLastYear']>upper_limit,upper_limit,np.where(df['TrainingTimesLastYear']<lower_limit,lower_limit,df['TrainingTimesLastYear']))
In [35]:
sns.boxplot(df.TrainingTimesLastYear)
Out[35]:
<Axes: >
In [36]:
q1 = df.YearsAtCompany.quantile(0.25)
q3 = df.YearsAtCompany.quantile(0.75)
IQR = q3-q1
IQR
Out[36]:
6.0
In [37]:
upper_limit = q3+1.5*IQR
In [38]:
df['YearsAtCompany'] = np.where(df['YearsAtCompany']>upper_limit,upper_limit,df['YearsAtCompany'])
In [39]:
sns.boxplot(df.YearsAtCompany)
Out[39]:
<Axes: >
In [40]:
q1 = df.YearsInCurrentRole.quantile(0.25)
q3 = df.YearsInCurrentRole.quantile(0.75)
IQR = q3-q1
IQR
Out[40]:
5.0
In [41]:
upper_limit = q3+1.5*IQR
In [42]:
df['YearsInCurrentRole'] = np.where(df['YearsInCurrentRole']>upper_limit,upper_limit,df['YearsInCurrentRole'])
In [43]:
sns.boxplot(df.YearsInCurrentRole)
Out[43]:
<Axes: >
In [44]:
q1 = df.YearsSinceLastPromotion.quantile(0.25)
q3 = df.YearsSinceLastPromotion.quantile(0.75)
IQR = q3-q1
IQR
Out[44]:
3.0
In [45]:
upper_limit = q3+1.5*IQR
In [46]:
df['YearsSinceLastPromotion'] = np.where(df['YearsSinceLastPromotion']>upper_limit,upper_limit,df['YearsSinceLastPromotion'])
In [47]:
sns.boxplot(df.YearsSinceLastPromotion)
Out[47]:
<Axes: >
In [48]:
q1 = df.YearsWithCurrManager.quantile(0.25)
q3 = df.YearsWithCurrManager.quantile(0.75)
IQR = q3-q1
IQR
Out[48]:
5.0
In [49]:
upper_limit = q3+1.5*IQR
In [50]:
df['YearsWithCurrManager'] = np.where(df['YearsWithCurrManager']>upper_limit,upper_limit,df['YearsWithCurrManager'])
In [51]:
sns.boxplot(df.YearsWithCurrManager)
Out[51]:
<Axes: >

Splitting into Dependent and Independent Variables¶

In [52]:
X = df.drop(['Attrition'],axis=1)
X.head()
Out[52]:
Age BusinessTravel DailyRate Department DistanceFromHome Education EducationField EnvironmentSatisfaction Gender HourlyRate ... PerformanceRating RelationshipSatisfaction StockOptionLevel TotalWorkingYears TrainingTimesLastYear WorkLifeBalance YearsAtCompany YearsInCurrentRole YearsSinceLastPromotion YearsWithCurrManager
0 41 Travel_Rarely 1102 Sales 1 2 Life Sciences 2 Female 94 ... 3 1 0 8.0 0.5 1 6.0 4.0 0.0 5.0
1 49 Travel_Frequently 279 Research & Development 8 1 Life Sciences 3 Male 61 ... 4 4 1 10.0 3.0 3 10.0 7.0 1.0 7.0
2 37 Travel_Rarely 1373 Research & Development 2 2 Other 4 Male 92 ... 3 2 0 7.0 3.0 3 0.0 0.0 0.0 0.0
3 33 Travel_Frequently 1392 Research & Development 3 4 Life Sciences 4 Female 56 ... 3 3 0 8.0 3.0 3 8.0 7.0 3.0 0.0
4 27 Travel_Rarely 591 Research & Development 2 1 Medical 1 Male 40 ... 3 4 1 6.0 3.0 3 2.0 2.0 2.0 2.0

5 rows × 30 columns

In [53]:
Y = df['Attrition']
Y.head()
Out[53]:
0    Yes
1     No
2    Yes
3     No
4     No
Name: Attrition, dtype: object

Encoding¶

In [54]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
In [55]:
columns = ['BusinessTravel','Department','EducationField','Gender','JobRole','MaritalStatus','OverTime']
In [56]:
X[columns] = X[columns].apply(le.fit_transform)
X.head()
Out[56]:
Age BusinessTravel DailyRate Department DistanceFromHome Education EducationField EnvironmentSatisfaction Gender HourlyRate ... PerformanceRating RelationshipSatisfaction StockOptionLevel TotalWorkingYears TrainingTimesLastYear WorkLifeBalance YearsAtCompany YearsInCurrentRole YearsSinceLastPromotion YearsWithCurrManager
0 41 2 1102 2 1 2 1 2 0 94 ... 3 1 0 8.0 0.5 1 6.0 4.0 0.0 5.0
1 49 1 279 1 8 1 1 3 1 61 ... 4 4 1 10.0 3.0 3 10.0 7.0 1.0 7.0
2 37 2 1373 1 2 2 4 4 1 92 ... 3 2 0 7.0 3.0 3 0.0 0.0 0.0 0.0
3 33 1 1392 1 3 4 1 4 0 56 ... 3 3 0 8.0 3.0 3 8.0 7.0 3.0 0.0
4 27 2 591 1 2 1 3 1 1 40 ... 3 4 1 6.0 3.0 3 2.0 2.0 2.0 2.0

5 rows × 30 columns

Feature Scaling¶

In [57]:
from sklearn.preprocessing import MinMaxScaler
ms = MinMaxScaler()
In [58]:
X_Scaled = pd.DataFrame(ms.fit_transform(X),columns=X.columns)
X_Scaled.head()
Out[58]:
Age BusinessTravel DailyRate Department DistanceFromHome Education EducationField EnvironmentSatisfaction Gender HourlyRate ... PerformanceRating RelationshipSatisfaction StockOptionLevel TotalWorkingYears TrainingTimesLastYear WorkLifeBalance YearsAtCompany YearsInCurrentRole YearsSinceLastPromotion YearsWithCurrManager
0 0.547619 1.0 0.715820 1.0 0.000000 0.25 0.2 0.333333 0.0 0.914286 ... 0.0 0.000000 0.000000 0.280702 0.000 0.000000 0.333333 0.275862 0.000000 0.344828
1 0.738095 0.5 0.126700 0.5 0.250000 0.00 0.2 0.666667 1.0 0.442857 ... 1.0 1.000000 0.333333 0.350877 0.625 0.666667 0.555556 0.482759 0.133333 0.482759
2 0.452381 1.0 0.909807 0.5 0.035714 0.25 0.8 1.000000 1.0 0.885714 ... 0.0 0.333333 0.000000 0.245614 0.625 0.666667 0.000000 0.000000 0.000000 0.000000
3 0.357143 0.5 0.923407 0.5 0.071429 0.75 0.2 1.000000 0.0 0.371429 ... 0.0 0.666667 0.000000 0.280702 0.625 0.666667 0.444444 0.482759 0.400000 0.000000
4 0.214286 1.0 0.350036 0.5 0.035714 0.00 0.6 0.000000 1.0 0.142857 ... 0.0 1.000000 0.333333 0.210526 0.625 0.666667 0.111111 0.137931 0.266667 0.137931

5 rows × 30 columns

Splitting Data into Train and Test¶

In [59]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(X_Scaled,Y,test_size=0.2,random_state=0)
In [60]:
print(x_train.shape,x_test.shape,y_train.shape,y_test.shape)
(1176, 30) (294, 30) (1176,) (294,)

Model Building¶

1. Logistic Regression¶

In [61]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
In [62]:
lr.fit(x_train,y_train)
Out[62]:
LogisticRegression()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LogisticRegression()
In [63]:
pred1 = lr.predict(x_test)
pred1
Out[63]:
array(['No', 'No', 'No', 'No', 'Yes', 'No', 'Yes', 'No', 'No', 'No', 'No',
       'No', 'No', 'Yes', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No',
       'No', 'No', 'Yes', 'No', 'No', 'No', 'Yes', 'No', 'No', 'No', 'No',
       'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No',
       'No', 'Yes', 'No', 'No', 'Yes', 'No', 'No', 'No', 'No', 'No',
       'Yes', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No',
       'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No',
       'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No',
       'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No',
       'No', 'No', 'No', 'Yes', 'No', 'No', 'No', 'No', 'No', 'No', 'No',
       'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No',
       'No', 'No', 'No', 'Yes', 'No', 'No', 'No', 'No', 'No', 'No', 'No',
       'No', 'Yes', 'No', 'No', 'No', 'Yes', 'No', 'No', 'No', 'No', 'No',
       'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No',
       'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'Yes',
       'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No',
       'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'Yes', 'No', 'No',
       'No', 'No', 'Yes', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No',
       'No', 'No', 'No', 'No', 'No', 'Yes', 'No', 'No', 'No', 'Yes', 'No',
       'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No',
       'No', 'No', 'No', 'Yes', 'No', 'No', 'No', 'No', 'No', 'No', 'No',
       'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No',
       'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No',
       'Yes', 'No', 'No', 'No', 'No', 'Yes', 'No', 'No', 'No', 'No', 'No',
       'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No',
       'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No',
       'No', 'No', 'No', 'No', 'No', 'No', 'Yes', 'No', 'No'],
      dtype=object)
In [64]:
y_test
Out[64]:
442      No
1091     No
981     Yes
785      No
1332    Yes
       ... 
1439     No
481      No
124     Yes
198      No
1229     No
Name: Attrition, Length: 294, dtype: object

Evaluation¶

In [65]:
from sklearn.metrics import accuracy_score,classification_report,roc_auc_score,roc_curve
In [66]:
accuracy_score(y_test,pred1)
Out[66]:
0.8775510204081632
In [67]:
pd.crosstab(y_test,pred1)
Out[67]:
col_0 No Yes
Attrition
No 241 4
Yes 32 17
In [68]:
print(classification_report(y_test,pred1))
              precision    recall  f1-score   support

          No       0.88      0.98      0.93       245
         Yes       0.81      0.35      0.49        49

    accuracy                           0.88       294
   macro avg       0.85      0.67      0.71       294
weighted avg       0.87      0.88      0.86       294

2. Decision Tree¶

In [69]:
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier()
In [70]:
dtc.fit(x_train,y_train)
Out[70]:
DecisionTreeClassifier()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
DecisionTreeClassifier()
In [71]:
pred2 = dtc.predict(x_test)
pred2
Out[71]:
array(['No', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'No', 'Yes',
       'No', 'Yes', 'No', 'Yes', 'No', 'No', 'No', 'No', 'No', 'No',
       'Yes', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'Yes',
       'No', 'No', 'Yes', 'No', 'No', 'No', 'No', 'Yes', 'No', 'No', 'No',
       'No', 'No', 'No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'Yes', 'Yes',
       'Yes', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No',
       'No', 'Yes', 'No', 'No', 'No', 'No', 'No', 'Yes', 'No', 'No', 'No',
       'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No',
       'No', 'No', 'No', 'Yes', 'No', 'No', 'No', 'No', 'Yes', 'No', 'No',
       'No', 'No', 'No', 'No', 'No', 'No', 'Yes', 'No', 'No', 'No', 'No',
       'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No',
       'No', 'No', 'No', 'No', 'No', 'Yes', 'Yes', 'No', 'No', 'No', 'No',
       'No', 'No', 'No', 'No', 'Yes', 'No', 'No', 'No', 'No', 'Yes', 'No',
       'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No',
       'No', 'Yes', 'No', 'No', 'No', 'No', 'No', 'No', 'Yes', 'Yes',
       'No', 'No', 'Yes', 'No', 'No', 'No', 'No', 'No', 'Yes', 'No', 'No',
       'No', 'No', 'No', 'Yes', 'No', 'Yes', 'Yes', 'No', 'No', 'No',
       'No', 'Yes', 'No', 'No', 'Yes', 'No', 'No', 'Yes', 'No', 'No',
       'No', 'Yes', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'Yes',
       'No', 'No', 'No', 'No', 'No', 'Yes', 'No', 'No', 'Yes', 'No', 'No',
       'Yes', 'No', 'No', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No',
       'No', 'No', 'No', 'Yes', 'Yes', 'No', 'No', 'Yes', 'No', 'No',
       'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No',
       'No', 'No', 'Yes', 'No', 'No', 'No', 'No', 'Yes', 'No', 'Yes',
       'No', 'No', 'Yes', 'No', 'Yes', 'No', 'No', 'No', 'No', 'No',
       'Yes', 'No', 'Yes', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No',
       'No', 'No', 'No', 'Yes', 'No', 'No', 'Yes', 'No', 'No', 'Yes',
       'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No'], dtype=object)
In [72]:
y_test
Out[72]:
442      No
1091     No
981     Yes
785      No
1332    Yes
       ... 
1439     No
481      No
124     Yes
198      No
1229     No
Name: Attrition, Length: 294, dtype: object

Evaluation¶

In [73]:
accuracy_score(y_test,pred2)
Out[73]:
0.7891156462585034
In [74]:
pd.crosstab(y_test,pred2)
Out[74]:
col_0 No Yes
Attrition
No 210 35
Yes 27 22
In [75]:
print(classification_report(y_test,pred2))
              precision    recall  f1-score   support

          No       0.89      0.86      0.87       245
         Yes       0.39      0.45      0.42        49

    accuracy                           0.79       294
   macro avg       0.64      0.65      0.64       294
weighted avg       0.80      0.79      0.80       294

Hyper Paramter Tuning¶

In [76]:
from sklearn.model_selection import GridSearchCV
parameter = {'criterion':['gini','entropy'],'splitter':['best','random'],'max_depth':[1,2,3,4,5],'max_features':['auto','sqrt','log2']}
In [106]:
grid_search = GridSearchCV(estimator=dtc,param_grid=parameter,cv=5,scoring='accuracy')
In [107]:
grid_search.fit(x_train,y_train)
C:\Users\Arun\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py:425: FitFailedWarning: 
100 fits failed out of a total of 300.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
100 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Arun\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Arun\anaconda3\Lib\site-packages\sklearn\base.py", line 1144, in wrapper
    estimator._validate_params()
  File "C:\Users\Arun\anaconda3\Lib\site-packages\sklearn\base.py", line 637, in _validate_params
    validate_parameter_constraints(
  File "C:\Users\Arun\anaconda3\Lib\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParameterError(
sklearn.utils._param_validation.InvalidParameterError: The 'max_features' parameter of DecisionTreeClassifier must be an int in the range [1, inf), a float in the range (0.0, 1.0], a str among {'sqrt', 'log2'} or None. Got 'auto' instead.

  warnings.warn(some_fits_failed_message, FitFailedWarning)
C:\Users\Arun\anaconda3\Lib\site-packages\sklearn\model_selection\_search.py:976: UserWarning: One or more of the test scores are non-finite: [       nan        nan 0.84013704 0.84183916 0.83588172 0.84013704
        nan        nan 0.83673278 0.84013704 0.83843491 0.83928597
        nan        nan 0.84693112 0.84269023 0.84692752 0.83929318
        nan        nan 0.83501262 0.83843491 0.83419401 0.83504508
        nan        nan 0.82228994 0.82908763 0.84183195 0.84098449
        nan        nan 0.84013704 0.84013704 0.84013704 0.84013704
        nan        nan 0.8316264  0.8409881  0.8409881  0.84013704
        nan        nan 0.82993869 0.84439235 0.84862964 0.83758024
        nan        nan 0.84097367 0.84182834 0.84607645 0.84013704
        nan        nan 0.84353408 0.83673999 0.84609088 0.84012982]
  warnings.warn(
Out[107]:
GridSearchCV(cv=5, estimator=DecisionTreeClassifier(),
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [1, 2, 3, 4, 5],
                         'max_features': ['auto', 'sqrt', 'log2'],
                         'splitter': ['best', 'random']},
             scoring='accuracy')
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
GridSearchCV(cv=5, estimator=DecisionTreeClassifier(),
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [1, 2, 3, 4, 5],
                         'max_features': ['auto', 'sqrt', 'log2'],
                         'splitter': ['best', 'random']},
             scoring='accuracy')
DecisionTreeClassifier()
DecisionTreeClassifier()
In [108]:
grid_search.best_params_
Out[108]:
{'criterion': 'entropy',
 'max_depth': 3,
 'max_features': 'log2',
 'splitter': 'best'}
In [109]:
dtc_cv = DecisionTreeClassifier(criterion='entropy',max_depth=3,max_features='log2',splitter='best')
dtc_cv.fit(x_train,y_train)
Out[109]:
DecisionTreeClassifier(criterion='entropy', max_depth=3, max_features='log2')
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
DecisionTreeClassifier(criterion='entropy', max_depth=3, max_features='log2')
In [110]:
pred2 = dtc_cv.predict(x_test)

Evaluation 2¶

In [111]:
accuracy_score(y_test,pred2)
Out[111]:
0.8435374149659864
In [112]:
pd.crosstab(y_test,pred2)
Out[112]:
col_0 No Yes
Attrition
No 242 3
Yes 43 6
In [113]:
print(classification_report(y_test,pred2))
              precision    recall  f1-score   support

          No       0.85      0.99      0.91       245
         Yes       0.67      0.12      0.21        49

    accuracy                           0.84       294
   macro avg       0.76      0.56      0.56       294
weighted avg       0.82      0.84      0.80       294

3. Random Forest¶

In [95]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier()
In [96]:
forest_params = [{'max_depth':list(range(10,15)),'max_features':list(range(0,14))}]
In [97]:
rfc_cv = GridSearchCV(rfc,param_grid=forest_params,cv=10,scoring='accuracy')
In [98]:
rfc_cv.fit(x_train,y_train)
C:\Users\Arun\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py:425: FitFailedWarning: 
50 fits failed out of a total of 700.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
50 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Arun\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Arun\anaconda3\Lib\site-packages\sklearn\base.py", line 1144, in wrapper
    estimator._validate_params()
  File "C:\Users\Arun\anaconda3\Lib\site-packages\sklearn\base.py", line 637, in _validate_params
    validate_parameter_constraints(
  File "C:\Users\Arun\anaconda3\Lib\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParameterError(
sklearn.utils._param_validation.InvalidParameterError: The 'max_features' parameter of RandomForestClassifier must be an int in the range [1, inf), a float in the range (0.0, 1.0], a str among {'sqrt', 'log2'} or None. Got 0 instead.

  warnings.warn(some_fits_failed_message, FitFailedWarning)
C:\Users\Arun\anaconda3\Lib\site-packages\sklearn\model_selection\_search.py:976: UserWarning: One or more of the test scores are non-finite: [       nan 0.85035492 0.85714182 0.86055338 0.86309576 0.86478343
 0.85883674 0.86052441 0.86477618 0.85966247 0.86307403 0.85967695
 0.86050992 0.86306678        nan 0.84779082 0.85714907 0.85800377
 0.86222657 0.86054614 0.85966971 0.85882225 0.86221208 0.86051717
 0.85966971 0.85881501 0.85371578 0.85966971        nan 0.85119513
 0.86054614 0.85884398 0.85543242 0.85967695 0.85967695 0.85627264
 0.86053165 0.85882949 0.86136462 0.86307403 0.86221208 0.85626539
        nan 0.85034767 0.85629436 0.86140084 0.85543242 0.85885122
 0.85626539 0.86137911 0.86392873 0.86305954 0.85966971 0.86136462
 0.85882949 0.8621976         nan 0.85120962 0.85969868 0.86137911
 0.86394321 0.8605389  0.85966971 0.85714182 0.85627264 0.86051717
 0.86137911 0.85796755 0.85371578 0.85882949]
  warnings.warn(
Out[98]:
GridSearchCV(cv=10, estimator=RandomForestClassifier(),
             param_grid=[{'max_depth': [10, 11, 12, 13, 14],
                          'max_features': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
                                           12, 13]}],
             scoring='accuracy')
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
GridSearchCV(cv=10, estimator=RandomForestClassifier(),
             param_grid=[{'max_depth': [10, 11, 12, 13, 14],
                          'max_features': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
                                           12, 13]}],
             scoring='accuracy')
RandomForestClassifier()
RandomForestClassifier()
In [99]:
pred3 = rfc_cv.predict(x_test)

Evaluation¶

In [103]:
accuracy_score(y_test,pred3)
Out[103]:
0.8503401360544217
In [104]:
pd.crosstab(y_test,pred3)
Out[104]:
col_0 No Yes
Attrition
No 242 3
Yes 41 8
In [105]:
print(classification_report(y_test,pred3))
              precision    recall  f1-score   support

          No       0.86      0.99      0.92       245
         Yes       0.73      0.16      0.27        49

    accuracy                           0.85       294
   macro avg       0.79      0.58      0.59       294
weighted avg       0.83      0.85      0.81       294